% -*- Mode: Tex -*- \documentstyle[12pt]{article} \marginparwidth 0pt \oddsidemargin 0.125in \evensidemargin 0.125in \marginparsep 0pt \topmargin -0.3in \textwidth 6.25in \textheight 9.0 in %\input{commands} % -*- Mode: TeX -*- % commands.tex % This file contains useful command declarations for latex documents. % Use "\input{commands}" before "\begin{document}" in your file. % NASA Logo. % \newcommand{\NASA}{\nasa NASA} % Mathematical constructs \newcommand{\given}{\!\mid\!} \newcommand{\diff}[1]{\,d{#1}} \newcommand{\determinant}[1]{\left| {#1} \right|} \newcommand{\abs}[1]{\left| {#1} \right|} \newcommand{\mean}[1]{\overline{#1}} \newcommand{\geomean}[1]{\widetilde{#1}} %\newcommand{\geomean}[1]{G\!M({#1}))} \newcommand{\mvec}[1]{\widehat{#1}} \newcommand{\bold}[1]{\mbox{\boldmath $ {#1} $}} \newcommand{\eqtr}{\stackrel{\triangle}{=}} \newcommand{\eqdef}{\stackrel{\rm def}{=}} \newcommand{\union}{\cup} \newcommand{\member}{\in} \newcommand{\infinity}{\infty} \newcommand{\pdf}{\wp} % Referencing constructs \newcommand{\Eqn}[1]{Equation~\ref{#1}} \newcommand{\eqn}[1]{equation~\ref{#1}} \newcommand{\Tab}[1]{Table~\ref{#1}} \newcommand{\tab}[1]{table~\ref{#1}} \newcommand{\Fig}[1]{Figure~\ref{#1}} \newcommand{\fig}[1]{figure~\ref{#1}} \newcommand{\Sect}[1]{Section~\ref{#1}} \newcommand{\sect}[1]{section~\ref{#1}} \newcommand{\Page}[1]{Page~\pageref{#1}} \newcommand{\page}[1]{page~\pageref{#1}} % Abbreviations \newcommand{\etc}{etc.\ } \newcommand{\ie}{i.e.\ } \newcommand{\eg}{e.g.\ } \newcommand{\AI}{A.I.\ } % Other commands \newcommand{\banner}[1]{ \pagestyle{empty} \setcounter{page}{0} \mbox{} \vspace{3.0in} \begin{center} {\Huge{#1}}\\ \vspace{1.0in} {\LARGE \today} \end{center} \newpage \pagestyle{plain}} % end of commands.tex \hyphenation{data-base} \hyphenation{lan-guage} \hyphenation{clas-si-fi-ca-tion} \newcommand{\spc} {\vspace{1pc}} \newcommand{\Th} {{\mbox{Th}}} \newcommand{\fignum} {\mbox{Fig.\ }} \newcommand{\xinj} {\xsubi \in \mbox{class } j} \newcommand{\xsubik} {\mbox{$x_{ik}$}} % data descriptors \newcommand{\xsubi} {\mbox{$\vec{x}_i$}} \newcommand{\xset} {\mbox{$\{\xsubi\}$}} \newcommand{\thetasubjkl}{\mbox{$\theta_{jkl}$}} % model parameters \newcommand{\Pthetasubjkl}{\mbox{${\scriptstyle p}\theta_{jkl}$}} \newcommand{\thetasubjk} {\mbox{$\vec{\theta}_{jk}$}} \newcommand{\Pthetasubjk}{\mbox{${\scriptstyle p}\vec{\theta}_{jk}$}} \newcommand{\thetasubj} {\mbox{$\vec{\theta}_j$}} \newcommand{\Pthetasubj}{\mbox{${\scriptstyle p}\vec{\theta}_j$}} \newcommand{\musubjk} {\mbox{$\mu_{jk}$}} \newcommand{\Pmusubjk} {\mbox{${\scriptstyle p}\mu_{jk}$}} \newcommand{\sigmasubjk}{\mbox{$\sigma_{jk}$}} \newcommand{\Psigmasubjk}{\mbox{${\scriptstyle p}\sigma_{jk}$}} \newcommand{\Pwtsubjk} {\mbox{${\scriptstyle p} wt_{jk}$}} \newcommand{\knownjk} {\mbox{$p(\mbox{known}_{jk})$}} %\newcommand{\Pknownjk} {\mbox{${\scriptstyle p}pk_{jk}$}} \newcommand{\unknownjk} {\mbox{$p(\mbox{unknown}_{jk})$}} %\newcommand{\Punknownjk}{\mbox{${\scriptstyle p}pu_{jk}$}} \newcommand{\pisubj} {\mbox{$\pi_j$}} \newcommand{\Ppisubj} {\mbox{${\scriptstyle p}\pi_j $}} \newcommand{\piandthetaset}{\mbox{$\{\pisubj, \thetasubj\}$}} \newcommand{\Ppiandthetaset}{\mbox{$\{\Ppisubj, \Pthetasubj\}$}} \newcommand{\probsubij} {\mbox{$p_{ij}$}} \newcommand{\Pprobsubij}{\mbox{${\scriptstyle p} p_{ij}$}} \begin{document} \renewcommand{\thefootnote}{\fnsymbol{footnote}} \begin{center} { \Large AutoClass: A Bayesian Classification System} \\ \end{center} \vspace{0.1in} \begin{flushleft} \small PETER CHEESEMAN% \footnote[1]{RIACS. 
This work partially supported by NASA grant NCC2--428.} \hfill (CHEESEMAN@PLUTO.ARC.NASA.GOV) \\ JAMES KELLY% \footnote[2]{Sterling Software (Don Freeman is now at the University of Pittsburgh)} \hfill (KELLY@PLUTO.ARC.NASA.GOV) \\ MATTHEW SELF% \footnotemark[2] \hfill (SELF@PLUTO.ARC.NASA.GOV) \\ JOHN STUTZ% \footnote[3]{NASA Ames Research Center} \hfill (STUTZ@PLUTO.ARC.NASA.GOV) \\ WILL TAYLOR% \footnotemark[2] \hfill (TAYLOR@PLUTO.ARC.NASA.GOV) \\ DON FREEMAN% \footnotemark[2] \end{flushleft} \vspace{0.15in} \noindent NASA Ames Research Center\\ \hspace{\parindent} Mail Stop 244-17, Moffett Field, CA 94035 U.S.A. \vspace{0.15in} \begin{center} Presented at the Fifth International Conference on Machine Learning.\\ \vspace{0.15in} Modified June 14, 1988. \end{center} \renewcommand{\thefootnote}{\arabic{footnote}} \setcounter{footnote}{0} \subsection*{Abstract} This paper describes AutoClass~II, a program for automatically discovering (inducing) classes from a database, based on a Bayesian statistical technique which automatically determines the most probable number of classes, their probabilistic descriptions, and the probability that each object is a member of each class. AutoClass has been tested on several large, real databases and has discovered previously unsuspected classes. There is no doubt that these classes represent new phenomena. \pagestyle{empty} %\input{intro} % -*- Mode: TeX -*- \section{Introduction} \label{intro section} The standard approach in much of AI and statistical pattern recognition research is that a classification consists of a partitioning of the data into separate subsets, and that these subsets {\em are\/} the classes. In the Bayesian approach classes are described by probability distributions over the attributes of the objects, specified by a model function and its parameters. To define a class is to describe (not list) the objects which belong to it. This approach appears in the statistical literature as the theory of finite mixtures \cite{everitt81b}. The Bayesian approach has several advantages over other methods: % \nopagebreak \begin{description} \item[$\bullet$ The number of classes is determined automatically.] \mbox{} \nopagebreak Deciding when to stop forming classes is a fundamental problem in classification. More classes can always explain the data better, so what should limit the number of classes the program finds? Many systems rely on an {\em ad hoc\/} stopping criterion. The Bayesian solution to the problem lies in the use of prior knowledge. We believe simpler class hypotheses (e.g., those with fewer classes) to be more likely than complex ones, in advance of seeing any data, and the {\em prior probability} of the hypothesis reflects this preference. The prior probability term prefers fewer classes, the likelihood of the data prefers more, and the two effects balance at the most probable number of classes. As a result, AutoClass finds only one class in random data. \item[$\bullet$ Objects are not assigned to classes absolutely.] \mbox{} \nopagebreak AutoClass calculates the probability of each object's membership in each class, providing a more intuitive classification than absolute partitioning techniques. An object described equally well by two class descriptions should not be assigned to either class with certainty, because the evidence cannot support such an assertion. \pagestyle{myheadings} \markright{AutoClass: A Bayesian Classification System} \item[$\bullet$ All attributes are potentially significant.] 
\mbox{} \nopagebreak
Classification can be based on any or all attributes simultaneously, not on just the most important one.
This represents an advantage of the Bayesian method over human classification.
In many applications, classes are distinguished not by one or even by several attributes, but by small differences in many.
Humans often have difficulty taking more than a few attributes into account.
The Bayesian approach utilizes all attributes simultaneously, permitting uniform consideration of all the data.
\item[$\bullet$ Data can be real or discrete.] \mbox{} \nopagebreak
Many previous methods have difficulty analyzing mixed data.
Some methods insist on real valued data \cite{dillon84}, while others accept only discrete data \cite{fisher87}.
There have been attempts to reconcile the two types of data by coercing real data into discrete form \cite{wong87} or by incorporating flexible thresholds into categorical classification \cite{quinlan87}.
Coercion of heterogeneous data to a single type destroys information and is done purely to meet the needs of the particular classification procedure.
The Bayesian approach can utilize the data exactly as they are given.
% \item[$\bullet$ No similarity measure is required.] \mbox{}
% \nopagebreak
% {\em Do we wish to continue with this claim, in view of what is said in
% \ref {theoretical priors section}?}
% The Bayesian method produces the same classes regardless of the
% scale and origin chosen to record the data, because it uses no
% ``similarity measure,'' as cluster analysis \cite{duda73} does, or
% ``category quality,'' as conceptual clustering \cite{langley87} does.
% It is very difficult to construct such measures that are invariant
% under scale changes (for scalar variables) or origin shifts (for
% location variables). The result is that different clusters can be
% obtained if the data are expressed in meters rather than inches, or if
% a different reference point is chosen---an undesirable property.
% AutoClass avoids the problem by using scale- and location-invariant
% priors (see section \ref{theoretical priors section}).
\end{description}
\pagestyle{myheadings} \markright{AutoClass: A Bayesian Classification System}
%\input{theory}
% -*- Mode: Tex -*-
\section{Overview of Bayesian Classification}
\label{theory section}
AutoClass is based on Bayes's theorem, a formula for combining probabilities.
Given observed data $D$ and a hypothesis $H$, it states that the probability that the hypothesis explains the data $p( H \given D )$ (called the {\em posterior}\/ probability of the hypothesis given the data) is proportional to the probability of observing the data if the hypothesis were known to be true $p( D \given H )$ (the {\em likelihood}\/ of the data) times the inherent probability of the hypothesis regardless of the data $p(H)$ (the {\em prior}\/ probability of the hypothesis).
Bayes's theorem is commonly expressed
% \nopagebreak
\begin{equation}
p( H \given D ) = \frac{ p(H) \: p( D \given H )}{ p(D) }.
\label{bayes's theorem}
\end{equation}
For our purposes, the hypothesis $H$ is the number and descriptions of the classes from which we believe the data $D$ to have been drawn.
Given $D$, we must select $H$ to maximize the posterior $p(H \given D)$.
For a specific classification hypothesis, calculation of the likelihood of the data involves a straightforward application of statistics.
The prior probability of the hypothesis is less transparent and is taken up in section \ref{theoretical priors section}.
Finally, the prior probability of the data, $p(D)$ in the denominator above, need not be calculated directly.
It can be derived as a normalizing constant or ignored so long as we seek only the relative probabilities of hypotheses.
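To make the use of \eqn{bayes's theorem} concrete, the following sketch compares two candidate hypotheses by their unnormalized posteriors.
It is written in Python, and the prior and likelihood values in it are invented purely for illustration (they are not AutoClass output); the point is only that the common factor $p(D)$ drops out of the comparison.
\begin{verbatim}
# Comparing two hypotheses by unnormalized posterior p(H) * p(D | H).
# All numbers below are invented for illustration.

# p(H): prior preference, e.g. for "2 classes" over "3 classes".
prior = {"H2": 0.6, "H3": 0.4}

# p(D | H): likelihood of the observed data under each hypothesis.
likelihood = {"H2": 1.0e-40, "H3": 2.0e-40}

# p(D) is a common factor, so the ranking of the hypotheses is fixed
# by the unnormalized products alone.
unnormalized = {h: prior[h] * likelihood[h] for h in prior}

total = sum(unnormalized.values())
posterior = {h: u / total for h, u in unnormalized.items()}
print(posterior)   # roughly {'H2': 0.43, 'H3': 0.57}
\end{verbatim}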
\subsection{Application to Classification}
\label{posterior section}
In the theory of finite mixtures (the mathematical foundation of AutoClass), each datum in a database containing \( I \) objects is assumed to be drawn from one of \( J \) classes.
Each class is described by a {\em class distribution}\label{class distribution} function, \( p(x_i \given x_i \member C_j, \vec{\theta}_j) \), which gives the probability distribution of the attributes of a datum if it were known to belong to class \( C_j \).
These class distributions are described by a {\em class parameter vector}, \(\vec{\theta}_j \), which, for a single-attribute normal distribution, would consist of the class mean, \( \mu_j \), and variance, \( \sigma_j^2 \).
The probability of an object being drawn from class \( j \) is called the {\em class probability} \( \pi_j \).
Thus, the probability of a given datum coming from a set of classes is the sum of the probabilities that it came from each class separately, weighted by the class probabilities.
\begin{equation}
p(x_i \given \vec{\theta}, \vec{\pi}, J) =
\sum_{j = 1}^J \pi_j \, p(x_i \given x_i \member C_j, \vec{\theta}_j).
\label{pre-likelihood}
\end{equation}
We assume that the data are drawn from an exchangeable (static) process---that is, the data are unordered and independent of each other given the model.
Thus the {\em likelihood} of measuring an entire database is the product of the probabilities of measuring each object.
\begin{equation}
p(\vec{x} \given \vec{\theta}, \vec{\pi}, J) =
\prod_{i = 1}^I p(x_i \given \vec{\theta}, \vec{\pi}, J)
\label{likelihood}
\end{equation}
For a given value of the class parameters, we can calculate the probability that object $i$ belongs to class $j$ using Bayes's theorem.
\begin{equation}
p(x_i \member C_j \given x_i, \vec{\theta}, \vec{\pi}, J) =
{\pi_j \, p(x_i \given x_i \member C_j, \vec{\theta}_j)
\over p(x_i \given \vec{\theta}, \vec{\pi}, J)}
\label{membership probability}
\end{equation}
These classes are ``fuzzy'' in the sense that even with perfect knowledge of an object's attributes, it will be possible to determine only the probability that it is a member of a given class.
We break the problem of identifying a finite mixture into two parts: determining the classification parameters for a given number of classes, and determining the number of classes.
Rather than seeking an {\em estimator\/} of the classification parameters (the class parameter vectors, \( \vec{\theta} \), and the class probabilities, \(\vec{\pi} \)), we seek their full {\em posterior}\/ probability distribution.
The posterior distribution is proportional to the product of the prior distribution of the parameters \( p(\vec{\theta}, \vec{\pi} \given J) \) and the likelihood function \( p(\vec{x} \given \vec{\theta}, \vec{\pi}, J) \).
\begin{equation} p(\vec{\theta}, \vec{\pi} \given \vec{x}, J) = {p(\vec{\theta}, \vec{\pi} \given J) \, p(\vec{x} \given \vec{\theta}, \vec{\pi}, J) \over p(\vec{x} \given J)} \label{conditional posterior} \end{equation} The pseudo-likelihood \( p(\vec{x} \given J) \) is simply the normalizing constant of the posterior distribution, obtained by marginalizing (integrating) out the classification parameters---in effect, treating them as ``nuisance'' parameters: \begin{equation} p(\vec{x} \given J) = \int \! \! \int \! p(\vec{\theta}, \vec{\pi} \given J) \, p(\vec{x} \given \vec{\theta}, \vec{\pi}, J) \diff{\vec{\theta}} \diff{\vec{\pi}}. \label{pseudo likelihood} \end{equation} To solve the second half of the classification problem (determining the number of classes) we calculate the posterior distribution of the number of classes \( J \). This is proportional to the product of the prior distribution \( p(J) \) and the pseudo-likelihood function \( p(\vec{x} \given J) \). \begin{equation} p(J \given \vec{x}) = {p(J) \, p(\vec{x} \given J) \over p(\vec{x})} \label{marginal posterior} \end{equation} In principle we can determine the most probable number of classes by evaluating \mbox{$p(J \given \vec{x})$} over the range of $J$ for which our prior $p(J)$ is significant. In practice, the multi-dimensional integrals of \eqn{pseudo likelihood} are computationally intractable, and we must search for the maximum of the function and approximate it about that point. Details of the AutoClass algorithm appear in section \ref{implementation section}. \subsection{Assumptions} \label{assumptions section} We cannot attempt classification without making some assumptions. Our mathematical formulation of the problem permits us to state our assumptions precisely and to assess their validity. The derivation above incorporates two: \nopagebreak \begin{enumerate} \item The data are independent given the model. That is, the data are unordered. This is a fundamental assumption intrinsic to all classification systems. \item The model functions---{\em class distributions}\/ of page \pageref{class distribution}---are appropriate descriptors of the classes. The model functions themselves may incorporate additional assumptions, such as independence of attribute values (as AutoClass currently does). \end{enumerate} \subsection{Prior Probabilities} \label{theoretical priors section} The prior probability term \( p(\vec{\theta}, \vec{\pi} \given J) \) in \eqn{conditional posterior} constitutes the fundamental difference between Bayesian and classical statistics and still fuels debate. We will not attempt to defend the use of priors herein but refer the skeptical reader to Jaynes~\cite{jaynes86} for a full explanation of the Bayesian approach. Introduction of the prior probability solves two problems. First, it permits mathematical determination of the number of classes by introducing a preference for fewer classes. We believe {\em a priori}\/ that complex hypotheses are less likely than simple ones, those with fewer classes, for example, and any reasonable prior implants this belief in the equations. A more complex hypothesis will incur a penalty in the prior and will be disfavored unless it explains the data significantly better. Second, the likelihood contains singularities that would complicate analysis if it were used alone. The prior tames the likelihood by damping out the singularities, and the resulting posterior is much better behaved. 
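The singularity problem can be seen in a small numerical sketch.
In a one-dimensional Gaussian mixture, if one class's mean is pinned to a data point, the likelihood grows without bound as that class's standard deviation shrinks toward zero; any prior on the standard deviation that vanishes quickly enough at zero keeps the posterior bounded.
The Python sketch below uses an invented data set and a hypothetical inverse-gamma-like prior chosen only to illustrate this damping; it is not the prior actually used by AutoClass.
\begin{verbatim}
import math

# Four data points and a two-class mixture in which class 1's mean is
# pinned to the first datum; we then shrink class 1's sigma toward 0.
data = [0.0, 1.0, 2.0, 3.0]

def normal_pdf(x, mu, sigma):
    z = (x - mu) / sigma
    return math.exp(-0.5 * z * z) / (math.sqrt(2.0 * math.pi) * sigma)

def log_likelihood(sigma1):
    # Equal mixing weights; class 1 is N(0, sigma1), class 2 is N(2, 1).
    return sum(math.log(0.5 * normal_pdf(x, 0.0, sigma1) +
                        0.5 * normal_pdf(x, 2.0, 1.0)) for x in data)

def log_prior(sigma1):
    # Hypothetical prior on sigma, vanishing rapidly as sigma -> 0
    # (inverse-gamma-like in sigma squared); for illustration only.
    return -2.0 * math.log(sigma1) - 1.0 / sigma1 ** 2

for sigma1 in (1.0, 0.1, 0.01, 0.001):
    print(sigma1, log_likelihood(sigma1),
          log_likelihood(sigma1) + log_prior(sigma1))

# The log likelihood increases without bound as sigma1 -> 0 (the
# singularity), while the log posterior turns over and falls.
\end{verbatim}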
There are two basic types of prior distribution: {\em informative}\/ priors expressing knowledge about the problem and {\em uninformative}\/ priors expressing ignorance.
Truly uninformative priors introduce no scale or location preferences and, as much as possible, allow the data to speak for themselves.
AutoClass~II uses informative priors which introduce a minimal bias into the classification.
Details appear in section~\ref{informative priors section}.
%\input{algorithm}
% -*- Mode: TeX -*-
\section{The AutoClass~II Program}
\label{implementation section}
Section \ref{theory section} described the theory behind the Bayesian approach to classification.
We now outline its implementation, AutoClass~II.
\subsection{The AutoClass~II Class Model}
\label{ac class model section}
% One of the most important features of the Bayesian approach is its
% flexibility with regard to the nature of the classes it finds. The
% class distribution functions in equation \ref{pre-likelihood} specify
% the nature of the classes sought. In theory, functions can be used
% which represent sharply defined or fuzzy classes, symmetric or skewed
% classes, or even distributions not commonly considered classes, as
% described in section \ref{beyond classification section}.
In AutoClass~II, we assume the data are in an attribute-value vector form.
That is, the database contains $I$ objects $x_i$, each described by $K$ attribute values \mbox{$x_{ik}, \; k \member \{1 \ldots K\}$}.
Attributes may be either real or discrete variables.
In AutoClass~II we currently make the further strong assumption that the attributes are independent in each class.
This permits an extremely simple form for the class distributions used in equation \ref{pre-likelihood}:
\begin{equation}
p(x_i \given x_i \member C_j, \vec{\theta}_j) =
\prod_{k = 1}^K p(x_{ik} \given x_i \in C_j,\: \vec{\theta}_{jk}),
\label{model-distribution}
\end{equation}
where $\vec{\theta}_{jk}$ is the parameter vector describing the $k$th attribute in the $j$th class $C_j$.
We plan to extend AutoClass to model covariance of the attributes within a class in the near future.
AutoClass models real-valued attributes with a Gaussian normal distribution, parameterized by a mean and a standard deviation, and thus $\vec{\theta}_{jk}$ takes the form
\begin{displaymath}
\vec{\theta}_{jk} =
\left[ \begin{array}{c} \mu_{jk} \\ \sigma_{jk} \end{array} \right].
\end{displaymath}
The class distribution is thus
\begin{equation}
p(x_{ik} \given x_i \in C_j,\: \mu_{jk}, \sigma_{jk}) =
\frac{1}{\sqrt{2 \pi} \sigma_{jk}}
\exp \left[ - \frac{1}{2}
\left( \frac{ x_{ik} - \mu_{jk}}{\sigma_{jk}} \right) ^{2} \right] .
\label{eq:like2}
\end{equation}
For discrete attributes the class distribution is specified by the probability $\rho_{jkl}$ of getting each possible value $l$.
The elements of $\vec{\theta}_{jk}$ are the probabilities themselves.
If there are $L$ possible values, labeled 1 to $L$, then the class distribution is
\begin{equation}
p(x_{ik} = l \given x_i \in C_j,\: \vec{\rho}_{jk}) = \rho_{jkl}.
\label{eq:like1}
\end{equation}
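A minimal sketch of these class distributions may be helpful.
The Python fragment below evaluates \eqn{model-distribution} for a single object under one class with two attributes, one real (\eqn{eq:like2}) and one discrete (\eqn{eq:like1}); the parameter values are invented for illustration and the code is not part of AutoClass.
\begin{verbatim}
import math

def normal_pdf(x, mu, sigma):
    z = (x - mu) / sigma
    return math.exp(-0.5 * z * z) / (math.sqrt(2.0 * math.pi) * sigma)

# Hypothetical class parameters theta_j for two attributes:
# attribute 0 is real, N(mu, sigma); attribute 1 is discrete with
# three possible values and probabilities rho.
theta_j = {
    "mu": 1.5, "sigma": 0.5,
    "rho": {"a": 0.7, "b": 0.2, "c": 0.1},
}

def class_likelihood(x, theta):
    # p(x_i | x_i in C_j, theta_j): with independent attributes this
    # is just the product of the per-attribute factors.
    real_value, discrete_value = x
    return (normal_pdf(real_value, theta["mu"], theta["sigma"])
            * theta["rho"][discrete_value])

print(class_likelihood((1.3, "a"), theta_j))
\end{verbatim}
Summing such per-class values weighted by the class probabilities $\pi_j$ gives the mixture probability of \eqn{pre-likelihood}.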
\subsection{Informative Priors}
\label{informative priors section}
Although uninformative priors can be derived for use in Bayesian classification, AutoClass~II currently uses informative priors.
This is due mostly to the history of the program, and the fact that we have obtained excellent results using these priors.
The prior information we use has only a small effect on the classification estimates.
AutoClass~II employs {\em conjugate} priors, that is, prior information in the same form as the data.
In effect, the prior information for attribute $k$ in class $j$ consists of a set of $w'$ fictitious data points, described by the same summary statistics as are used for the actual data.
The larger the value of $w'$, the stronger the influence of the prior relative to the data.
AutoClass~II currently treats all classes symmetrically, using the same conjugate points for every class and for each attribute of the same type.
% AutoClass~II assumes {\em a priori\/} independence of the $J$
% class probabilities and each class's parameter vector:
% p(\vec{\theta}, \vec{\pi} \given \vec{w'}, \vec{\theta'})
% = \prod_{j = 1}^J
% p(\pi_j \given w'_\pi, \pi'_j)
% p(\vec{\theta}_j \given \vec{w'}_j, \vec{\theta'}_j)
% p(\vec{\pi} \given w'_\pi)
% = {\Gamma(J \, w'_\pi) \over \Gamma(w'_\pi)^J}
% \prod_{j = 1}^J \pi_j^{w'_\pi - 1}
% p(\mu_j, \sigma_j \given n_0, \bar{x}_0, s_0)
% = {2 \over \Gamma \left( {1 \over 2} \right)
% \Gamma \left( {n_0 - 1 \over 2} \right)}
% \left( {n_0 \over 2} \right)^{n_0 \over 2}
% {s_0^{n_0 - 1} \over \sigma_j^{n_0 + 1}}
% \exp \left(
% - {1 \over 2}
% {n_0 \left[ s_0^2 + (\mu_j - \bar{x}_0)^2
% \over \sigma_j^2 \right]}
% \right)
% {\em The priors remove the problem of poles and reduce the problem of
% local maxima.}
\subsection{Search Algorithm}
\label{algorithm section}
As mentioned in section \ref{theory section}, AutoClass breaks the classification problem into two parts: determining the number of classes and determining the parameters defining them.
Equation \ref{marginal posterior} gives the probability distribution over the number of classes.
For each possible number of classes the multi-dimensional integral of equation \ref{pseudo likelihood} must be evaluated.
Rather than attempt to integrate the posterior distribution of the classification parameters directly, AutoClass performs a search over $\vec{\theta}_j$ and $\pi_j$ to find the maximum of the posterior distribution (equation~\ref{conditional posterior}), and then approximates the integral around that point.
The complete problem involves starting with more classes than are believed to be present (as specified by the user), searching to find the best class parameters for that number of classes, approximating the integral to find the relative probability of that number of classes, and then decreasing the number of classes and repeating the procedure.
AutoClass uses a Bayesian variant of Dempster and Laird's EM algorithm~\cite{dempster77} to find the best class parameters for a given number of classes (the maximum of \eqn{conditional posterior}).
To derive the algorithm, we differentiate the posterior distribution with respect to the class parameters and equate the result with zero.
This yields a system of nonlinear equations which hold at the maximum of the posterior:
\begin{equation}
\hat{\pi}_j = {W_j \; + \; w' - 1 \over I \; + \; J (w' - 1)}
\label{pi update}
\end{equation}
\begin{equation}
{\partial \over \partial \theta_j} \ln p(\hat{\theta}_j) +
\sum_{i = 1}^I w_{ij}
{\partial \over \partial \theta_j} \ln p(x_i \given \hat{\theta}_j) = 0,
\label{parameter update}
\end{equation}
where $w_{ij}$ is the probability that the datum $x_i$ was drawn from class $j$ (previously given in equation~\ref{membership probability}), and $W_j$ is the total weight in class $j$:
\begin{eqnarray*}
w_{ij} & = & p(x_i \member C_j \given x_i, \hat{\theta}, \hat{\pi}) \\
W_j & = & \sum_{i=1}^I w_{ij}.
\end{eqnarray*}
To find a solution to this system of equations, we iterate between equations \ref{pi update} and \ref{parameter update} (treating \( \vec{w} \) as a constant) and equation \ref{membership probability} (treating \( \vec{\pi} \) and \( \vec{\theta} \) as constants).
On any given iteration, the membership probabilities are constant, so \eqn{parameter update} can be simplified by bringing $w_{ij}$ through the derivative, giving
\begin{equation}
{\partial \over \partial \theta_j}
\left[ p(\hat{\theta}_j)
\prod_{i = 1}^I p(x_i \given \hat{\theta}_j, x_i \member C_j)^{w_{ij}}
\right] = 0.
\label{eq:partial-2}
\end{equation}
Thus far, our discussion of the search algorithm has related to a general class model with an arbitrary $\vec{\theta}_{jk}$.
We now apply equation \ref{eq:partial-2} to the specific AutoClass~II model of equations \ref{model-distribution} through \ref{eq:like1}.
For discrete attributes, the values of the updated parameters $\hat{\rho}_{jkl}$ derived from the class probabilities $w_{ij}$ and prior weight $w'$ are
\begin{eqnarray}
\hat{\rho}_{jkl} & = &
\frac {\sum_{i=1}^I w_{ij} \delta(l,\,x_{ik}) + w' - 1}
{W_j + L (w' - 1)}
\hspace{.25in}
\delta(l,\,x_{ik}) \: \equiv \:
\left\{ \begin{array}{ll}
1, & x_{ik} = l \\
0, & \mbox{otherwise.}
\end{array} \right.
\label{eq:disc2}
\end{eqnarray}
This is simply the weighted proportion of objects in class \( j \) for which attribute \( k \) had value \( l \).
For real-valued attributes, the equations for the updated $\hat{\mu}_{jk}$ and $\hat{\sigma}_{jk}$ are a function of the prior information and the empirical mean, $\bar{x}_{jk}$, and variance, $s_{jk}^2$, of the \( k \)th attribute in class \( j \), weighted by $w_{ij}$:
\begin{eqnarray*}
\bar{x}_{jk} & = & \frac{\sum_{i=1}^I w_{ij} x_{ik}}{W_j}, \\
s_{jk}^2 & = & \frac{\sum_{i=1}^I w_{ij} x_{ik}^2}{W_j} - \bar{x}_{jk}^2.
\end{eqnarray*}
The update formulas are then
\begin{eqnarray}
\hat{\mu}_{jk} & = & \frac {w' \bar{x}'_k + W_j \bar{x}_{jk}} {w' + W_j}, \\
\hat{\sigma}_{jk}^2 & = &
\frac {w' (s_k')^2 + W_j s_{jk}^2}{w' + W_j + 1}
+ \frac{w' W_j}{(w'+W_j)(w'+W_j+1)}
\left( \bar{x}'_k - \bar{x}_{jk} \right)^2,
\end{eqnarray}
where $\bar{x}'_k$ and $s'_k$ are the mean and standard deviation of the $w'$ fictitious prior points for attribute $k$.
The computational cost of a single iteration is of order \mbox{$I \cdot J \cdot K$}.
Typically the procedure converges in about twenty iterations, depending upon the strength of the actual classes.
A search in data having many weak classes takes longer than one having few strong classes.
The search may be speeded up by over-relaxation techniques.
Of course, the procedure may converge to local maxima depending on the starting point chosen for the iteration, so we employ heuristic methods to jump away from local maxima.
Even so, many searches may be necessary to establish the global maximum.
\subsection{Determining the Number of Classes}
We previously outlined the theory for the determination of the number of classes present, but integration over the full parameter space is clearly infeasible.
AutoClass~II uses a crude but very effective approximation based solely on the results of the iteration algorithm.
If a class has negligible class probability $\pi_j$, then including that class in the model cannot improve the likelihood of the data at all.
At the same time, the prior probability that any class probability is near zero is very low.
Thus models in which a class has negligible probability will always be less probable than models which simply omit that class.
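The search iteration described above is compact enough to sketch.
The Python fragment below applies \eqn{membership probability}, \eqn{pi update}, and the real-attribute update formulas to a single real-valued attribute; the data, the prior summary statistics (\verb|x_prior|, \verb|s_prior|), and the single prior weight \verb|w_prior| shared by all updates are invented for illustration, and the fragment is a sketch rather than the AutoClass implementation.
\begin{verbatim}
import math, random

random.seed(0)

# Invented one-attribute data drawn from two well separated groups.
data = ([random.gauss(0.0, 1.0) for _ in range(100)] +
        [random.gauss(5.0, 1.0) for _ in range(100)])
I = len(data)

J       = 3      # start with more classes than are believed present
w_prior = 2.0    # weight w' of the fictitious prior points
x_prior = 2.0    # prior mean (hypothetical value)
s_prior = 2.0    # prior standard deviation (hypothetical value)

def normal_pdf(x, mu, sigma):
    z = (x - mu) / sigma
    return math.exp(-0.5 * z * z) / (math.sqrt(2.0 * math.pi) * sigma)

# Arbitrary starting point for the iteration.
pi    = [1.0 / J] * J
lo, hi = min(data), max(data)
mu    = [lo + (j + 0.5) * (hi - lo) / J for j in range(J)]
sigma = [s_prior] * J

for iteration in range(20):
    # Membership probabilities w_ij (Bayes's theorem per object).
    w = []
    for x in data:
        mix = [pi[j] * normal_pdf(x, mu[j], sigma[j])
               for j in range(J)]
        total = sum(mix)
        w.append([m / total for m in mix])

    for j in range(J):
        # Total weight W_j in class j.
        W_j = sum(w[i][j] for i in range(I))
        # Class probability update.
        pi[j] = (W_j + w_prior - 1.0) / (I + J * (w_prior - 1.0))
        # Weighted empirical mean and variance of the attribute.
        xbar = sum(w[i][j] * data[i] for i in range(I)) / W_j
        ex2  = sum(w[i][j] * data[i] ** 2 for i in range(I)) / W_j
        s2   = ex2 - xbar ** 2
        # Parameter updates combining prior statistics and data.
        mu[j] = (w_prior * x_prior + W_j * xbar) / (w_prior + W_j)
        var = ((w_prior * s_prior ** 2 + W_j * s2)
               / (w_prior + W_j + 1.0)
               + (w_prior * W_j * (x_prior - xbar) ** 2)
               / ((w_prior + W_j) * (w_prior + W_j + 1.0)))
        sigma[j] = math.sqrt(var)

# Classes whose probability pi[j] is negligible after convergence
# can simply be dropped, as discussed above.
print(["%.3f" % p for p in pi])
print(["%.2f" % m for m in mu])
\end{verbatim}
A full implementation would loop over all $K$ attributes, apply \eqn{eq:disc2} for the discrete ones, and repeat the whole search for successively smaller $J$.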
The user runs AutoClass with $J$ larger than the expected number of classes.
If all resulting classes have significant probability then the user increases $J$ until some classes are empty.
AutoClass then ignores the empty classes, and the populated classes represent an optimal classification of the data given the assumed class model function.
The utility of this approach has been experimentally confirmed on a number of prepared databases.
Specifically, when AutoClass runs with $J$ greater than the actual number of classes present, the iteration converges with negligible probability for the extra classes.
This behavior differs qualitatively from the behavior of maximum likelihood methods, which will continue to partition the classes until eventually there is only one object in each class.
%\input{extensions}
% -*- Mode: TeX -*-
\section{Extensions to the Model}
\label{extensions section}
\subsection{Hierarchical Classification}
\label{hierarchical classification section}
After a database has been analyzed, the resulting classes frequently have many attributes in common.
For instance, in a database of mammals, the ``dog'' class and the ``cat'' class will be described by the same values for many attributes (fur, four legs, etc.).
In this case, a description of all the attributes of all the classes separately is not as useful as a {\em hierarchical\/} classification scheme which identifies the common attributes and the distinguishing attributes between classes.
The Bayesian method can accommodate hierarchical classification by considering a model in which some attributes are common to a group of sub-classes.
This amounts to a significance test of the equality of an attribute's parameter vectors across the classes of the non-hierarchical classification.
If there is no significant difference between some attributes of a group of classes, then these attributes may be estimated jointly.
% The Bayesian method can accommodate hierarchical classification with
% little difficulty, because identification of the low-level classes is
% independent of concerns about the hierarchical structure. Once the
% classes have been found, class definitions can be partially combined
% to reflect shared attribute values. The criteria for forming the
% hierarchy are subjective and should be tailored to the needs of the
% user. This is done more cleanly as a post-processing step than during
% the initial search for classes.
\subsection{Supervised Classification}
\label{supervised classification section}
Although AutoClass was designed for automatic unsupervised classification, the prior information in equation \ref{conditional posterior} permits supervised classification as well.
If the user wishes to assert that certain objects are in certain classes, a prior probability can be used which favors class descriptions reflecting this.
See Duda and Hart \cite{duda73} for a discussion of supervised Bayesian inference.
\subsection{Missing Data}
\label{missing data section}
Consistent probability calculations require that `unknown' be treated as a valid data value.
This is not merely a computational convenience.
Failure to determine a value is just as valid an observation as any other, and must be allowed for in any predictive model.
There may be physical reasons that a value is unknown, and discarding that fact (by interpolating a value or, even worse, discarding the object completely) destroys potentially valuable information.
A straightforward extension of the class model, sketched below, allows AutoClass to accept objects with unknown values.
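For a single real-valued attribute, the form this extension takes is easy to sketch in code, anticipating \eqn{eq:like4b} below: the class also models the probability that the value is known at all, and the Gaussian density applies only when it is.
The parameter values here are invented and the fragment is illustrative only.
\begin{verbatim}
import math

def normal_pdf(x, mu, sigma):
    z = (x - mu) / sigma
    return math.exp(-0.5 * z * z) / (math.sqrt(2.0 * math.pi) * sigma)

def attribute_likelihood(value, rho_known, mu, sigma):
    # p(x_ik | x_i in C_j, rho_jk, mu_jk, sigma_jk);
    # value is None when the attribute is unknown for this object.
    if value is None:
        return 1.0 - rho_known
    return rho_known * normal_pdf(value, mu, sigma)

print(attribute_likelihood(2.3,  rho_known=0.9, mu=2.0, sigma=1.0))
print(attribute_likelihood(None, rho_known=0.9, mu=2.0, sigma=1.0))
\end{verbatim}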
For discrete attributes it can be shown that the correct procedure for treating an unknown value is equivalent to adding an `unknown' category to the value set. For real-valued attributes we condition our Gaussian normal model with discrete `unknown' and `known' categories: \begin{eqnarray} \thetasubjk & = & [\rho_{jk}, \musubjk, \sigmasubjk] \label{eq:like4a}\\ p(\xsubik \given x_i \in C_j,\: \rho_{jk}, \musubjk, \sigmasubjk) & = & \left\{ \begin{array}{ll} \rho_{jk} \frac{1}{\sqrt{2 \pi}\: \sigmasubjk} \exp \left[ - \frac{1}{2} \left( \frac{\xsubik - \musubjk}{\sigmasubjk} \right) ^{2} \right], & \mbox{$\xsubik$ known} \\ 1 - \rho_{jk}, & \mbox{$\xsubik$ unknown} \end{array} \right. \label{eq:like4b} \end{eqnarray} The mean and variance are updated as before, but the proportion of data for which \( x_k \) is known is also updated just like any other discrete variable. % \subsection{Integerized Real Data} % \label{integerized real data section} % Although the Bayesian method is careful to leave the data inviolate, % some data may be in a suboptimal form to begin with. For example, although % a person's age is in fact a real value, it is usually rounded down to an % integer number of years. If it is treated as a real variable, a class % whose members are the same age will have zero variance, creating normalization % problems. If it is treated as a discrete variable, on the other hand, % information about the order of the values will be lost. % The solution is to specify a range over which the class model function % is to be integrated, rather than one point at which it is to be % evaluated. In other words, we do not specify that a girl's age is five % years, but that there is a uniform probability of her age falling % between four and five years. % \subsection{Beyond Classification} % \label{beyond classification section} % The Bayesian approach to mixture separation applies to more than % classification; it is the general solution to the problem % of finding models in data. Many other models are possible, for % example, time series models, Markov models, Influence diagrams, % Decision trees, etc. Bretthorst \cite{bretthorst87} uses periodic % models to extract underlying frequencies from extremely noisy signals. % Self is investigating exponential models to separate % decay rates in noisy decay spectra. The Bayesian approach quickly % reduces these to search problems, at which point efficient search % algorithms become the goal of research. %\input{results} % -*- Mode: Tex -*- \section{Results} \label{results section} AutoClass has classified data supplied by researchers active in various domains and has yielded some new and intriguing results: \begin{description} \item[$\bullet$ Iris Database] \mbox{} \nopagebreak Fisher's data on three species of iris~\cite{fisher36} are a classic test for classification systems. AutoClass discovers the three classes present in the data with very high confidence, despite the fact that not all of the cases can be assigned to their classes with certainty. Wolfe's NORMIX and NORMAP~\cite{wolfe70} both incorrectly found four classes, and Dubes's MH index~\cite{dubes87} offers only weak evidence for three clusters. \item[$\bullet$ Soybean Disease Database] \mbox{} \nopagebreak AutoClass found the four known classes in Stepp's soybean disease data, providing a comparison with Michalski's CLUSTER/2 system~\cite{michalski83a}. 
AutoClass's class assignments exactly matched Michalski's---each object belonged overwhelmingly to one class, indicating exceptionally well-separated classes for so small a database (47 cases, 35 attributes).
\item[$\bullet$ Horse Colic Database] \mbox{} \nopagebreak
AutoClass analyzed the results of 50 veterinary tests on 259 horses and extracted classes which provided reliable disease diagnoses, despite the fact that almost 40\% of the data were missing.
\item[$\bullet$ Infrared Astronomy Database] \mbox{} \nopagebreak
The Infrared Astronomical Satellite tabulation of stellar spectra is not only the largest database AutoClass has assayed (5,425 cases, 94 attributes) but the least thoroughly understood by domain experts.
AutoClass discovered classes which differed significantly from NASA's previous analysis but clearly reflect physical phenomena in the data.
% As an example, Figures 1 and 2 show two classes whose spectral
% distributions (Figs.\ 1a and 2a) are very similar, but whose galactic
% coordinate distributions (Figs.\ 1b and 2b) are dramatically
% dissimilar. Figs. 1a and 2a show normalized spectra of 50 stars from
% each class overlaid on each other, and the curve with error bars is
% the mean spectrum after removing the black body component.
Note that AutoClass knows nothing about spectra---the current model treats the intensity at each wavelength as an independent quantity.
As a result AutoClass would find exactly the same classes if the order of the wavelengths were scrambled.
The AutoClass infrared source classification is the basis of a new star catalog to appear shortly.
\end{description}
We are actively collecting and analyzing other databases which seem appropriate for classification, including an AIDS database and a second infrared spectral database.
%\input{conclu}
\section{Conclusion}
\label{conclusion section}
This paper has described the Bayesian approach to the problem of classification and AutoClass, a simple implementation of it.
Bayesian probability theory provides a simple and extensible approach to problems such as classification and general mixture separation.
Its theoretical basis is free of {\em ad hoc\/} quantities, and in particular free of any measures which alter the data to suit the needs of the program.
As a result, the elementary classification model we have described lends itself easily to extensions.
% If we are ever to realize the long range goal of true artificial
% intelligence, programs must be written on a much grander scale than
% the prototypical programs on which the AI community is working today.
% We hope to scale up today's techniques to build tomorrow's programs,
% and therefore the simplicity and extensibility of our approaches to AI
% problems must be our paramount concerns. Although it may not be
% immediately evident to the mathematically faint of heart, we believe
% the Bayesian approach we have outlined has this simplicity and will
% fill an important role in AI in the future.
%\bibliographystyle{plain}
%\bibliography{autoclass}
\begin{thebibliography}{10}
\bibitem{dempster77}
A.~P. Dempster, N.~M. Laird, and D.~B. Rubin.
\newblock Maximum likelihood from incomplete data via the {EM} algorithm.
\newblock {\it Journal of the Royal Statistical Society, Series B}, 39(1):1--38, 1977.
\bibitem{dillon84}
W. Dillon and M. Goldstein.
\newblock {\it Multivariate Analysis: Methods and Applications}, chapter~3.
\newblock Wiley, 1984.
\bibitem{dubes87}
Richard~C. Dubes.
\newblock How many clusters are best? --- an experiment.
\newblock {\it Pattern Recognition}, 20(6):645--663, 1987.
\bibitem{duda73}
Richard~O. Duda and Peter~E. Hart.
\newblock {\it Pattern Classification and Scene Analysis}.
\newblock Wiley-Interscience, 1973.
\bibitem{everitt81b}
B.~S. Everitt and D.~J. Hand.
\newblock {\it Finite Mixture Distributions}.
\newblock {\it Monographs on Applied Probability and Statistics}, Chapman and Hall, London, England, 1981.
\newblock Extensive bibliography.
\bibitem{fisher87}
D.~H. Fisher.
\newblock Conceptual clustering, learning from examples, and inference.
\newblock In {\it Proceedings of the Fourth International Workshop on Machine Learning}, pages~38--49, Morgan Kaufmann, 1987.
\bibitem{fisher36}
R.~A. Fisher.
\newblock Multiple measurements in taxonomic problems.
\newblock {\it Annals of Eugenics}, VII:179--188, 1936.
\bibitem{jaynes86}
Edwin~T. Jaynes.
\newblock {Bayesian} methods: general background.
\newblock In James~H. Justice, editor, {\it Maximum Entropy and {Bayesian} Methods in Applied Statistics}, pages~1--25, Cambridge University Press, Cambridge, England, 1986.
\bibitem{michalski83a}
Ryszard~S. Michalski and Robert~E. Stepp.
\newblock Automated construction of classifications: conceptual clustering versus numerical taxonomy.
\newblock {\it IEEE Transactions on Pattern Analysis and Machine Intelligence}, PAMI-5:396--410, 1983.
\bibitem{quinlan87}
J.~R. Quinlan.
\newblock Decision trees as probabilistic classifiers.
\newblock In {\it Proceedings of the Fourth International Workshop on Machine Learning}, pages~31--37, Morgan Kaufmann, 1987.
\bibitem{wolfe70}
John~H. Wolfe.
\newblock Pattern clustering by multivariate mixture analysis.
\newblock {\it Multivariate Behavioral Research}, 5:329--350, July 1970.
\bibitem{wong87}
A. Wong and D. Chiu.
\newblock Synthesizing statistical knowledge from incomplete mixed-mode data.
\newblock {\it IEEE Transactions on Pattern Analysis and Machine Intelligence}, PAMI-9:796--805, 1987.
\end{thebibliography}
\end{document}